In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

def lgb_model_init(x_train,y_train):
    lgb_clf=LGBMClassifier()
    lgb_clf.fit(x_train,y_train)
    
    return lgb_clf

def lgb_model_tuned(x_train,y_train):
    
    grid_search = {'num_leaves': [11,21,31],
               'max_depth': [5,8,10],
               'min_data_in_leaf': [10, 15, 20],
               'learning_rate': [0.1,0.003,0.001],
               'n_estimators': [50,100]}
    clf = LGBMClassifier()
    grid = GridSearchCV(estimator = clf, param_grid = grid_search, 
                               cv = 4, verbose= 5, n_jobs = -1)
    grid.fit(x_train,y_train)
    
    lgb_model=grid.best_estimator_
    return lgb_model
In [2]:
data = pd.read_csv('preprocessing1.csv', encoding = 'cp949', index_col=0 )
seed = 5764

target=data['Status']
data.drop(['Status'],axis=1,inplace = True)

# 학습, 테스트 데이터 분리 (0.7:0.3)
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=seed)
In [3]:
lgb_model = lgb_model_tuned(x_train,y_train)
Fitting 4 folds for each of 162 candidates, totalling 648 fits
[LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=10
In [4]:
import numpy as np
from lime import lime_tabular

lgb_lime_explainer = lime_tabular.LimeTabularExplainer(np.array(x_train),feature_names = data.columns , mode="classification")
for i in range(0,10):
    lgb_lime_explanation = lgb_lime_explainer.explain_instance(x_test.iloc[i], lgb_model.predict_proba)
    lgb_lime_explanation.show_in_notebook(show_table=True)
    print(i)
0
1
2
3
4
5
6
7
8
9
In [47]:
import shap 

lgb_shap_explainer = shap.TreeExplainer(lgb_model)
shap_values = lgb_shap_explainer.shap_values(x_test)
LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
In [48]:
shap_values0 = shap_values[0]
shap_values1 = shap_values[1]
In [46]:
pd.DataFrame(shap_values)
Out[46]:
0 1 2 3 4 5 6 7 8 9 ... 53 54 55 56 57 58 59 60 61 62
0 -0.060913 5.938066 0.033142 1.337879 -0.027559 -0.106786 0.021367 -0.001228 0.037628 -0.039497 ... 0.018534 -0.067606 -0.011855 0.000306 0.0 0.0 0.0 0.000042 0.0 0.0
1 -0.100589 1.817809 0.293854 1.327279 -0.018856 -0.039895 -0.043687 -0.000038 -0.111681 -0.013843 ... -0.016512 0.000143 0.027499 -0.002004 0.0 0.0 0.0 -0.000036 0.0 0.0
2 0.031435 -0.244622 0.277961 -2.480536 -0.019976 -0.036943 -0.047571 -0.000558 -0.066861 -0.110250 ... -0.003020 0.000013 -0.062603 0.000276 0.0 0.0 0.0 -0.000036 0.0 0.0
3 -0.027382 -1.116766 0.089582 0.444481 -0.035768 -0.047052 0.044000 -0.000556 -0.043766 -0.185097 ... 0.011086 0.000013 -0.003684 0.000306 0.0 0.0 0.0 -0.000023 0.0 0.0
4 -0.127619 -1.709845 -0.031520 0.342881 -0.082172 -0.113986 -0.047293 -0.000097 -0.096669 -0.116064 ... 0.003405 0.000013 -0.005900 0.000394 0.0 0.0 0.0 -0.000070 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
43190 -0.068260 -0.434048 0.431488 0.749769 -0.040857 -0.056899 -0.025563 -0.000027 -0.096319 -0.131242 ... 0.016858 -0.067606 0.002202 0.000111 0.0 0.0 0.0 0.000043 0.0 0.0
43191 -0.028327 -1.323152 -0.603167 0.558277 -0.094224 -0.143546 0.021240 -0.000075 0.405080 -0.057729 ... 0.003336 -0.067606 -0.043265 0.000298 0.0 0.0 0.0 0.000042 0.0 0.0
43192 0.003222 -1.274914 -0.733685 0.548409 -0.085359 -0.148556 0.015578 0.000037 -0.011205 -0.073217 ... 0.001393 -0.067606 -0.052286 0.000405 0.0 0.0 0.0 -0.000035 0.0 0.0
43193 -0.139284 -0.600632 -0.267923 -2.245942 -0.014664 -0.049018 -0.051033 -0.001116 -0.055401 -0.116513 ... -0.020422 -0.067606 0.079443 -0.001919 0.0 0.0 0.0 -0.000070 0.0 0.0
43194 -0.033462 -1.149843 0.330163 0.162047 -0.023018 -0.037735 -0.065641 0.000041 -0.042708 -0.099078 ... -0.006054 -0.067606 0.040033 -0.001517 0.0 0.0 0.0 -0.000036 0.0 0.0

43195 rows × 63 columns

In [49]:
shap.summary_plot(shap_values1, x_test)
In [52]:
shap.dependence_plot('rate_of_interest', shap_values1, x_test)
In [54]:
shap.dependence_plot('Upfront_charges', shap_values1, x_test)